In [1]:
%pip install pandas-datareader
Requirement already satisfied: pandas-datareader in c:\users\shuos\anaconda3\lib\site-packages (0.10.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: lxml in c:\users\shuos\anaconda3\lib\site-packages (from pandas-datareader) (5.2.1)
Requirement already satisfied: pandas>=0.23 in c:\users\shuos\anaconda3\lib\site-packages (from pandas-datareader) (2.2.2)
Requirement already satisfied: requests>=2.19.0 in c:\users\shuos\anaconda3\lib\site-packages (from pandas-datareader) (2.32.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (2024.8.30)
Requirement already satisfied: six>=1.5 in c:\users\shuos\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.23->pandas-datareader) (1.16.0)
In [2]:
%pip install BeautifulSoup
Collecting BeautifulSoup
Note: you may need to restart the kernel to use updated packages.
  Using cached BeautifulSoup-3.2.2.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [7 lines of output]
      Traceback (most recent call last):
        File "<string>", line 2, in <module>
        File "<pip-setuptools-caller>", line 34, in <module>
        File "C:\Users\shuos\AppData\Local\Temp\pip-install-y827_syz\beautifulsoup_ba2a3fc17f69417f98e1bc3e12db11f6\setup.py", line 3
          "You're trying to run a very old release of Beautiful Soup under Python 3. This will not work."<>"Please use Beautiful Soup 4, available through the pip package 'beautifulsoup4'."
                                                                                                         ^^
      SyntaxError: invalid syntax
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

× Encountered error while generating package metadata.
╰─> See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
In [3]:
%pip install bs4
Requirement already satisfied: bs4 in c:\users\shuos\anaconda3\lib\site-packages (0.0.2)
Requirement already satisfied: beautifulsoup4 in c:\users\shuos\anaconda3\lib\site-packages (from bs4) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\shuos\anaconda3\lib\site-packages (from beautifulsoup4->bs4) (2.5)
Note: you may need to restart the kernel to use updated packages.
In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
import textwrap
In [5]:
pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/storms.csv", nrows=10
)
rownames name year month day hour lat long status category wind pressure tropicalstorm_force_diameter hurricane_force_diameter
0 1 Amy 1975 6 27 0 27.5 -79.0 tropical depression NaN 25 1013 NaN NaN
1 2 Amy 1975 6 27 6 28.5 -79.0 tropical depression NaN 25 1013 NaN NaN
2 3 Amy 1975 6 27 12 29.5 -79.0 tropical depression NaN 25 1013 NaN NaN
3 4 Amy 1975 6 27 18 30.5 -79.0 tropical depression NaN 25 1013 NaN NaN
4 5 Amy 1975 6 28 0 31.5 -78.8 tropical depression NaN 25 1012 NaN NaN
5 6 Amy 1975 6 28 6 32.4 -78.7 tropical depression NaN 25 1012 NaN NaN
6 7 Amy 1975 6 28 12 33.3 -78.0 tropical depression NaN 25 1011 NaN NaN
7 8 Amy 1975 6 28 18 34.0 -77.0 tropical depression NaN 30 1006 NaN NaN
8 9 Amy 1975 6 29 0 34.4 -75.8 tropical storm NaN 35 1004 NaN NaN
9 10 Amy 1975 6 29 6 34.0 -74.8 tropical storm NaN 40 1002 NaN NaN
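Once the preview looks right, the full file can be pulled down and summarised with the usual pandas tools; a minimal sketch, counting observations by storm status (column names taken from the preview above):

# Read the full storms dataset and count rows in each storm status category
storms = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/storms.csv"
)
storms["status"].value_counts().head()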
In [6]:
url = "http://aeturrell.com/research"
page = requests.get(url)
page.text[:300]
'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>\n\n<meta charset="utf-8">\n<meta name="generator" content="quarto-1.5.56">\n\n<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n\n<meta name="author" content="Arthur Turrell">\n'
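Before handing the HTML to a parser, it's worth confirming that the request actually succeeded; a small check using standard requests attributes:

# 200 means the page was retrieved successfully;
# raise_for_status() raises an exception for 4xx/5xx responses
print(page.status_code)
page.raise_for_status()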
In [7]:
soup = BeautifulSoup(page.text, "html.parser")
print(soup.prettify()[60000:60500])
       </div>
          <div class="project-category">
           <a href="#category=gender pay gap">
            gender pay gap
           </a>
          </div>
          <div class="project-category">
           <a href="#category=labour">
            labour
           </a>
          </div>
          <div class="project-category">
           <a href="#category=text analysis">
            text analysis
           </a>
          </div>
         </div>
         <div class="project-details-listing
In [8]:
# Get all paragraphs
all_paras = soup.find_all("p")
# Just show one of the paras
all_paras[1]
<p>Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." <i>Environment and Planning B: Urban Analytics and City Science</i> (2024): 23998083241267331. doi: <a href="https://doi.org/10.1177/23998083241267331"><code>10.1177/23998083241267331</code></a></p>
In [9]:
all_paras[1].text
'Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." Environment and Planning B: Urban Analytics and City Science (2024): 23998083241267331. doi: 10.1177/23998083241267331'
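The textwrap module imported earlier hasn't been used yet; it is handy for displaying long extracted paragraphs. A minimal sketch:

# Wrap the paragraph text at 70 characters so it prints neatly
print(textwrap.fill(all_paras[1].text, width=70))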
In [10]:
projects = soup.find_all("div", class_="project-content listing-pub-info")
projects = [x.text.strip() for x in projects]
projects[:4]
['Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." Environment and Planning B: Urban Analytics and City Science (2024): 23998083241267331. doi: 10.1177/23998083241267331',
 'Kalamara, Eleni, Arthur Turrell, Chris Redl, George Kapetanios, and Sujit Kapadia. "Making text count: economic forecasting using newspaper text." Journal of Applied Econometrics 37, no. 5 (2022): 896-919. doi: 10.1002/jae.2907',
 'Turrell, A., Speigner, B., Copple, D., Djumalieva, J. and Thurgood, J., 2021. Is the UK’s productivity puzzle mostly driven by occupational mismatch? An analysis using big data on job vacancies. Labour Economics, 71, p.102013. doi: 10.1016/j.labeco.2021.102013',
 'Haldane, Andrew G., and Arthur E. Turrell. "Drawing on different disciplines: macroeconomic agent-based models." Journal of Evolutionary Economics 29 (2019): 39-66. doi: 10.1007/s00191-018-0557-5']
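Since the publications are now plain strings, they drop straight into a dataframe for further analysis; a small sketch using the pandas import from earlier (the column name "publication" is just illustrative):

# Put the scraped publication strings into a one-column dataframe
df_projects = pd.DataFrame({"publication": projects})
df_projects.head()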
In [11]:
df_list = pd.read_html(
    "https://simple.wikipedia.org/wiki/FIFA_World_Cup", match="Sweden"
)
# Retrieve first and only entry from list of dataframes
df = df_list[0]
df.head()
Years Hosts Winners Score Runner's-up Third place Score.1 Fourth place
0 1930 Details Uruguay Uruguay 4 - 2 Argentina United States [note 1] Yugoslavia
1 1934 Details Italy Italy 2 - 1 Czechoslovakia Germany 3 - 2 Austria
2 1938 Details France Italy 4 - 2 Hungary Brazil 4 - 2 Sweden
3 1950 Details Brazil Uruguay 2 - 1 Brazil Sweden [note 2] Spain
4 1954 Details Switzerland West Germany 3 - 2 Hungary Austria 3 - 1 Uruguay
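With the table in a dataframe, ordinary pandas operations apply; for example, a quick tally of how often each country appears in the Winners column (column name taken from the output above):

# Count appearances in the Winners column
df["Winners"].value_counts().head()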
In [12]:
%pip install pdftotext
Collecting pdftotext
  Using cached pdftotext-2.2.2.tar.gz (113 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pdftotext
  Building wheel for pdftotext (setup.py): started
  Building wheel for pdftotext (setup.py): finished with status 'error'
  Running setup.py clean for pdftotext
Failed to build pdftotext
Note: you may need to restart the kernel to use updated packages.
  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [11 lines of output]
      WARNING: pkg-config not found--guessing at poppler version.
               If the build fails, install pkg-config and try again.
      WARNING: pkg-config not found--guessing at poppler version.
               If the build fails, install pkg-config and try again.
      WARNING: pkg-config not found--guessing at poppler version.
               If the build fails, install pkg-config and try again.
      running bdist_wheel
      running build
      running build_ext
      building 'pdftotext' extension
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for pdftotext
ERROR: Could not build wheels for pdftotext, which is required to install pyproject.toml-based projects
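The wheel build failed because pdftotext needs a C++ compiler and the poppler library at install time. On a machine where the install does succeed, the intended usage is roughly as follows (a sketch only; "report.pdf" is a placeholder filename):

import pdftotext

# Open a PDF and extract the text of every page
with open("report.pdf", "rb") as f:  # placeholder file
    pdf = pdftotext.PDF(f)
print("\n\n".join(pdf))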
In [13]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
In [14]:
# Download data on IMDb's top 250 movies
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
In [15]:
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]
In [16]:
# Create an empty list to store each movie's details
movie_data = []

# Iterate over the movies and extract
# each movie's details
for index in range(0, len(movies)):

    # Separate each entry into: 'place',
    # 'title', 'year'
    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split()).replace('.', '')
    place = movie.split(' ')[0]
    movie_title = movie[len(place) + 1:-7]
    year = re.search(r'\((.*?)\)', movie_string).group(1)
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    movie_data.append(data)
In [17]:
for movie in movie_data:
    print(movie['place'], '-', movie['movie_title'], '(' + movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])
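Nothing is printed here because the selectors in the earlier cell matched no elements, most likely because IMDb has changed the chart's markup since this snippet was written (and the site may reject requests without browser-like headers), so movies, crew, and ratings are all empty. A quick check makes that visible:

# If these are all 0, the CSS selectors no longer match the page
print(len(movies), len(crew), len(ratings))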
In [18]:
# Save the list of dicts as a dataframe,
# then write it out to a .csv file
df = pd.DataFrame(movie_data)
df.to_csv('imdb_top_250_movies.csv', index=False)
In [19]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd


# Download data on IMDb's top 250 movies
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]




# Create an empty list to store each movie's details
movie_data = []

# Iterate over the movies and extract
# each movie's details
for index in range(0, len(movies)):

    # Separate each entry into: 'place',
    # 'title', 'year'
    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split()).replace('.', '')
    place = movie.split(' ')[0]
    movie_title = movie[len(place) + 1:-7]
    year = re.search(r'\((.*?)\)', movie_string).group(1)
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    movie_data.append(data)

# Print each movie's details along with its rating
for movie in movie_data:
    print(movie['place'], '-', movie['movie_title'], '(' + movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])


# Save the results to a .csv file
df = pd.DataFrame(movie_data)
df.to_csv('imdb_top_250_movies.csv', index=False)
In [20]:
%pip install requests
Requirement already satisfied: requests in c:\users\shuos\anaconda3\lib\site-packages (2.32.2)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2024.8.30)
Note: you may need to restart the kernel to use updated packages.
In [21]:
%pip install beautifulsoup4
Requirement already satisfied: beautifulsoup4 in c:\users\shuos\anaconda3\lib\site-packages (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\shuos\anaconda3\lib\site-packages (from beautifulsoup4) (2.5)
Note: you may need to restart the kernel to use updated packages.
In [22]:
import requests

# Define the request URL and headers
url = "https://movie.douban.com/top250"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'  # Set the encoding
html_content = response.text  # Get the page's HTML content
print("Page content loaded successfully!")
Page content loaded successfully!
In [23]:
from bs4 import BeautifulSoup

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract each film's title, description, rating, and number of votes
movies = []
for item in soup.find_all('div', class_='item'):
    title = item.find('span', class_='title').get_text()  # Film title
    description = item.find('span', class_='inq')  # Film description
    rating = item.find('span', class_='rating_num').get_text()  # Rating
    votes = item.find('div', class_='star').find_all('span')[3].get_text()  # Number of votes

    # If there is no description, set it to an empty string
    if description:
        description = description.get_text()
    else:
        description = ''

    movie = {
        "title": title,
        "description": description,
        "rating": rating,
        "votes": votes.replace('人评价', '').strip()  # Strip the '人评价' ("people rated") suffix
    }
    movies.append(movie)

print("Data extracted successfully!")
Data extracted successfully!
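Before writing the file, it's worth a quick sanity check that the extraction produced what was expected; a minimal peek at the first few records:

# Inspect the first few extracted records
for m in movies[:3]:
    print(m["title"], m["rating"], m["votes"])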
In [24]:
import csv

# Save the data to a CSV file
with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['title', 'description', 'rating', 'votes']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()  # Write the header row
    for movie in movies:
        writer.writerow(movie)  # Write one row per film

print("Data saved to douban_top250.csv")
Data saved to douban_top250.csv
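To confirm the file was written correctly, it can be read straight back in with pandas; a quick check:

import pandas as pd

# Read the CSV back in and preview it
df_douban = pd.read_csv('douban_top250.csv')
df_douban.head()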
In [25]:
import requests
from bs4 import BeautifulSoup
import csv

# Define the request URL and headers
url = "https://movie.douban.com/top250"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'  # Set the encoding
html_content = response.text  # Get the page's HTML content

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract each film's title, description, rating, and number of votes
movies = []
for item in soup.find_all('div', class_='item'):
    title = item.find('span', class_='title').get_text()  # Film title
    description = item.find('span', class_='inq')  # Film description
    rating = item.find('span', class_='rating_num').get_text()  # Rating
    votes = item.find('div', class_='star').find_all('span')[3].get_text()  # Number of votes

    # If there is no description, set it to an empty string
    if description:
        description = description.get_text()
    else:
        description = ''

    movie = {
        "title": title,
        "description": description,
        "rating": rating,
        "votes": votes.replace('人评价', '').strip()  # Strip the '人评价' ("people rated") suffix
    }
    movies.append(movie)

# Save the data to a CSV file
with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['title', 'description', 'rating', 'votes']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()  # Write the header row
    for movie in movies:
        writer.writerow(movie)  # Write one row per film

print("Data saved to douban_top250.csv")
Data saved to douban_top250.csv